Notebook dedicado a la extracción de los metadatos geográficos y temporales de las imágenes. Los datos geográficos se enriquecerán, además, con la distancia a la residencia habitual, el municipio y la provincia.
import math
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
# Packages propios
from utils.graphdbmanipulation import ImagesGraphDB
from utils.imagesmanipulation import ImageHelper
# Paths of interest: folder with the images to analyse and the GML file
# where the resulting graph is written
PATH_IMAGENES_A_ANALIZAR = "c:/Users/dcsj/Repositorios/UOC_Master_DS_TFM/image_classifier/resources/images/Movil-S21"
PATH_GML_FILE = 'c:/Users/dcsj/Repositorios/UOC_Master_DS_TFM/image_classifier/outputs/graph_databases/graph_GEO_TIME.gml'
# Coordinates (latitude, longitude) of the usual residence
RESIDENCE_LATITUDE = 41.462435251362294
RESIDENCE_LONGITUDE = 2.203910532194043
Se van a extraer los datos del GPS (latitud y longitud) de las imágenes, en caso de que dispongan de esa información, y la fecha de realización de la foto. A partir de la latitud y la longitud, mediante el servicio https://nominatim.openstreetmap.org/reverse se van a obtener diversos datos adicionales: municipio, código postal, provincia y país.
Además, con los datos de longitud y latitud extraídos se calculará la distancia en km respecto a la que se considera residencia habitual. Por último, se extraerán la fecha y hora, el día, el mes y el año como información necesaria para estudios de frecuencias y periodos que puedan ayudar en la determinación de álbumes.
# Instantiate the helper class that extracts and enriches the image metadata,
# then run it over the image folder using the residence coordinates.
imagehelper = ImageHelper()
geo_date_data = imagehelper.get_geo_and_date_images_data(
    PATH_IMAGENES_A_ANALIZAR,
    RESIDENCE_LATITUDE,
    RESIDENCE_LONGITUDE,
)
geo_date_data.head(10)
# Save the generated dataframe to a CSV file and load it back.
# Raw strings keep the backslashes of the Windows-style relative path literal
# instead of relying on invalid escape sequences (\o, \i, \g), which newer
# Python versions warn about.
# NOTE: the index is written as well, so it comes back as the "Unnamed: 0" column.
geo_date_data.to_csv(r'.\outputs\intermediate_csv_files\geo_date_data.csv')
geo_date_data = pd.read_csv(r'.\outputs\intermediate_csv_files\geo_date_data.csv', header=0)
geo_date_data.head(10)
| Unnamed: 0 | filename | latitude | longitude | city | postcode | province | country | distance_km | datestring | day | month | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 20210724_100218.jpg | 41.458438 | 2.204826 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.450968 | 2021:07:24 10:02:18 | 24 | 7 | 2021 |
| 1 | 1 | 20210724_104730.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:31 | 24 | 7 | 2021 |
| 2 | 2 | 20210724_104736.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:36 | 24 | 7 | 2021 |
| 3 | 3 | 20210724_104739.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:39 | 24 | 7 | 2021 |
| 4 | 4 | 20210724_104743.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:44 | 24 | 7 | 2021 |
| 5 | 5 | 20210724_104757.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:57 | 24 | 7 | 2021 |
| 6 | 6 | 20210724_200143.jpg | 41.451868 | 2.208307 | Santa Coloma de Gramenet | 08921 | Barcelona | España | 1.230864 | 2021:07:24 20:01:43 | 24 | 7 | 2021 |
| 7 | 7 | 20210724_200145.jpg | 41.451868 | 2.208307 | Santa Coloma de Gramenet | 08921 | Barcelona | España | 1.230864 | 2021:07:24 20:01:45 | 24 | 7 | 2021 |
| 8 | 8 | 20210724_200231_02.jpg | 41.451868 | 2.208307 | Santa Coloma de Gramenet | 08921 | Barcelona | España | 1.230859 | 2021:07:24 20:02:31 | 24 | 7 | 2021 |
| 9 | 9 | 20210724_200231_03.jpg | 41.451868 | 2.208307 | Santa Coloma de Gramenet | 08921 | Barcelona | España | 1.230859 | 2021:07:24 20:02:31 | 24 | 7 | 2021 |
from datetime import datetime
# Load the extracted metadata into a graph and save the graph as a GML file.
# Raw string: the Windows-style relative path contains backslashes that would
# otherwise be parsed as (invalid) escape sequences.
geo_date_data = pd.read_csv(r'.\outputs\intermediate_csv_files\geo_date_data.csv', header=0)
imagesGraphDB = ImagesGraphDB()

# One node per time slot of the day; each image is linked to exactly one of them.
for franja in ('early_morning', 'morning', 'afternoon', 'night'):
    imagesGraphDB.add_node(franja)
    imagesGraphDB.set_attribute_to_node(franja, 'type', 'franja')

for row in geo_date_data.itertuples():
    # Image node
    imagesGraphDB.add_node(row.filename)
    imagesGraphDB.set_attribute_to_node(row.filename, 'type', 'image_filename')
    imagesGraphDB.set_attribute_to_node(row.filename, 'time', row.datestring)
    # distance_km starts at -1 so that images without coordinates can be told apart
    imagesGraphDB.set_attribute_to_node(row.filename, 'distance_km', -1)

    # Day node, labelled "<year>_<month>_<day>" (no zero padding)
    date_time = datetime.strptime(row.datestring, '%Y:%m:%d %H:%M:%S')
    date = f"{date_time.year}_{date_time.month}_{date_time.day}"
    imagesGraphDB.add_node(date)
    imagesGraphDB.set_attribute_to_node(date, 'type', 'fecha')
    imagesGraphDB.add_edge(row.filename, date)

    # Link the image to its time slot:
    # 0-6 early_morning, 7-11 morning, 12-18 afternoon, 19-23 night
    hour = date_time.hour
    if hour <= 6:
        franja = 'early_morning'
    elif hour < 12:
        franja = 'morning'
    elif hour <= 18:
        franja = 'afternoon'
    else:
        franja = 'night'
    imagesGraphDB.add_edge(row.filename, franja)

    # Month node
    imagesGraphDB.add_node(row.month)
    imagesGraphDB.add_edge(row.filename, row.month)
    imagesGraphDB.set_attribute_to_node(row.month, 'type', 'mes')

    # Year node
    imagesGraphDB.add_node(row.year)
    imagesGraphDB.add_edge(row.filename, row.year)
    imagesGraphDB.set_attribute_to_node(row.year, 'type', 'anyo')

    # Geographic data is only added for rows with a positive distance;
    # the remaining images keep the -1 placeholder set above.
    if float(row.distance_km) > 0:
        # Geographic attributes of the image node
        imagesGraphDB.set_attribute_to_node(row.filename, 'distance_km', str(row.distance_km))
        imagesGraphDB.set_attribute_to_node(row.filename, 'latitude', row.latitude)
        imagesGraphDB.set_attribute_to_node(row.filename, 'longitude', row.longitude)

        # Municipality node
        imagesGraphDB.add_node(row.city)
        imagesGraphDB.set_attribute_to_node(row.city, 'type', 'municipio')
        imagesGraphDB.add_edge(row.filename, row.city)

        # Province node
        imagesGraphDB.add_node(row.province)
        imagesGraphDB.add_edge(row.filename, row.province)
        imagesGraphDB.set_attribute_to_node(row.province, 'type', 'provincia')

        # Country node
        imagesGraphDB.add_node(row.country)
        imagesGraphDB.add_edge(row.filename, row.country)
        imagesGraphDB.set_attribute_to_node(row.country, 'type', 'pais')

        # Postcode node
        imagesGraphDB.add_node(row.postcode)
        imagesGraphDB.add_edge(row.filename, row.postcode)
        imagesGraphDB.set_attribute_to_node(row.postcode, 'type', 'CP')

imagesGraphDB.write_gml_file(PATH_GML_FILE)
# Load the results from the CSV file.
# Raw string so the backslashes of the Windows-style relative path are taken
# literally instead of forming invalid escape sequences.
geo_date_data = pd.read_csv(r'.\outputs\intermediate_csv_files\geo_date_data.csv', header=0)
geo_date_data.head(10)
| Unnamed: 0 | filename | latitude | longitude | city | postcode | province | country | distance_km | datestring | day | month | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 20210724_100218.jpg | 41.458438 | 2.204826 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.450968 | 2021:07:24 10:02:18 | 24 | 7 | 2021 |
| 1 | 1 | 20210724_104730.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:31 | 24 | 7 | 2021 |
| 2 | 2 | 20210724_104736.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:36 | 24 | 7 | 2021 |
| 3 | 3 | 20210724_104739.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:39 | 24 | 7 | 2021 |
| 4 | 4 | 20210724_104743.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:44 | 24 | 7 | 2021 |
| 5 | 5 | 20210724_104757.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:57 | 24 | 7 | 2021 |
| 6 | 6 | 20210724_200143.jpg | 41.451868 | 2.208307 | Santa Coloma de Gramenet | 08921 | Barcelona | España | 1.230864 | 2021:07:24 20:01:43 | 24 | 7 | 2021 |
| 7 | 7 | 20210724_200145.jpg | 41.451868 | 2.208307 | Santa Coloma de Gramenet | 08921 | Barcelona | España | 1.230864 | 2021:07:24 20:01:45 | 24 | 7 | 2021 |
| 8 | 8 | 20210724_200231_02.jpg | 41.451868 | 2.208307 | Santa Coloma de Gramenet | 08921 | Barcelona | España | 1.230859 | 2021:07:24 20:02:31 | 24 | 7 | 2021 |
| 9 | 9 | 20210724_200231_03.jpg | 41.451868 | 2.208307 | Santa Coloma de Gramenet | 08921 | Barcelona | España | 1.230859 | 2021:07:24 20:02:31 | 24 | 7 | 2021 |
# Keep only the records that carry longitude and latitude data.
# NOTE(review): presumably rows without GPS data hold a non-positive latitude
# placeholder; this test would also drop southern-hemisphere photos — confirm.
images_with_location = geo_date_data[geo_date_data['latitude'].gt(0)]
images_with_location
| Unnamed: 0 | filename | latitude | longitude | city | postcode | province | country | distance_km | datestring | day | month | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 20210724_100218.jpg | 41.458438 | 2.204826 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.450968 | 2021:07:24 10:02:18 | 24 | 7 | 2021 |
| 1 | 1 | 20210724_104730.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:31 | 24 | 7 | 2021 |
| 2 | 2 | 20210724_104736.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:36 | 24 | 7 | 2021 |
| 3 | 3 | 20210724_104739.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:39 | 24 | 7 | 2021 |
| 4 | 4 | 20210724_104743.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:44 | 24 | 7 | 2021 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 923 | 923 | 20220521_172319.jpg | 41.346790 | 2.077990 | unknown_city | 08820 | Barcelona | España | 16.602778 | 2022:05:21 17:23:19 | 21 | 5 | 2022 |
| 924 | 924 | 20220521_172320.jpg | 41.346790 | 2.077990 | unknown_city | 08820 | Barcelona | España | 16.602778 | 2022:05:21 17:23:21 | 21 | 5 | 2022 |
| 925 | 925 | 20220521_173555.jpg | 41.346774 | 2.077726 | unknown_city | 08820 | Barcelona | España | 16.618153 | 2022:05:21 17:35:55 | 21 | 5 | 2022 |
| 1363 | 1363 | IMG_20180822_173932.jpg | 45.659504 | 10.048676 | Iseo | 25049 | Lombardia | Italia | 785.110217 | 2018:08:22 17:39:32 | 22 | 8 | 2018 |
| 1364 | 1364 | IMG_20180824_195613.jpg | 43.295361 | 5.363323 | Marseille | 13002 | Provence-Alpes-Côte d'Azur | France | 329.938580 | 2018:08:24 19:56:13 | 24 | 8 | 2018 |
926 rows × 13 columns
# Truncate latitude and longitude to one decimal place to build the heat map.
truncate_factor = 10
images_with_location_truncated = geo_date_data.loc[geo_date_data['latitude'] > 0].copy()
# The truncated columns are derived from this same frame, so the cell does not
# depend on a dataframe created in another cell.
images_with_location_truncated['latitude_truncated'] = images_with_location_truncated['latitude'].apply(
    lambda x: math.trunc(x * truncate_factor) / truncate_factor
)
images_with_location_truncated['longitude_truncated'] = images_with_location_truncated['longitude'].apply(
    lambda x: math.trunc(x * truncate_factor) / truncate_factor
)
images_with_location_truncated.head(10)
| Unnamed: 0 | filename | latitude | longitude | city | postcode | province | country | distance_km | datestring | day | month | year | latitude_truncated | longitude_truncated | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 20210724_100218.jpg | 41.458438 | 2.204826 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.450968 | 2021:07:24 10:02:18 | 24 | 7 | 2021 | 41.4 | 2.2 |
| 1 | 1 | 20210724_104730.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:31 | 24 | 7 | 2021 | 41.4 | 2.2 |
| 2 | 2 | 20210724_104736.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:36 | 24 | 7 | 2021 | 41.4 | 2.2 |
| 3 | 3 | 20210724_104739.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:39 | 24 | 7 | 2021 | 41.4 | 2.2 |
| 4 | 4 | 20210724_104743.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:44 | 24 | 7 | 2021 | 41.4 | 2.2 |
| 5 | 5 | 20210724_104757.jpg | 41.455110 | 2.202973 | Santa Coloma de Gramenet | 08924 | Barcelona | España | 0.818283 | 2021:07:24 10:47:57 | 24 | 7 | 2021 | 41.4 | 2.2 |
| 6 | 6 | 20210724_200143.jpg | 41.451868 | 2.208307 | Santa Coloma de Gramenet | 08921 | Barcelona | España | 1.230864 | 2021:07:24 20:01:43 | 24 | 7 | 2021 | 41.4 | 2.2 |
| 7 | 7 | 20210724_200145.jpg | 41.451868 | 2.208307 | Santa Coloma de Gramenet | 08921 | Barcelona | España | 1.230864 | 2021:07:24 20:01:45 | 24 | 7 | 2021 | 41.4 | 2.2 |
| 8 | 8 | 20210724_200231_02.jpg | 41.451868 | 2.208307 | Santa Coloma de Gramenet | 08921 | Barcelona | España | 1.230859 | 2021:07:24 20:02:31 | 24 | 7 | 2021 | 41.4 | 2.2 |
| 9 | 9 | 20210724_200231_03.jpg | 41.451868 | 2.208307 | Santa Coloma de Gramenet | 08921 | Barcelona | España | 1.230859 | 2021:07:24 20:02:31 | 24 | 7 | 2021 | 41.4 | 2.2 |
# Count the images per (truncated latitude, truncated longitude) cell for the heat map.
df_grouped_area = (
    images_with_location_truncated
    .groupby(['latitude_truncated', 'longitude_truncated'])
    .size()
    .reset_index(name='counts')
)
df_grouped_area.head(10)
| latitude_truncated | longitude_truncated | counts | |
|---|---|---|---|
| 0 | 40.7 | 0.2 | 55 |
| 1 | 40.8 | 0.1 | 175 |
| 2 | 40.8 | 0.2 | 24 |
| 3 | 40.8 | 0.8 | 7 |
| 4 | 41.1 | 1.5 | 64 |
| 5 | 41.1 | 1.6 | 6 |
| 6 | 41.3 | 2.0 | 4 |
| 7 | 41.3 | 2.1 | 99 |
| 8 | 41.4 | 2.1 | 6 |
| 9 | 41.4 | 2.2 | 133 |
# Show the grouped counts as a density heat map over a tile map.
# "open-street-map" is used because the Stamen styles are no longer served
# without a Stadia Maps API key, whereas this style needs no token.
fig = px.density_mapbox(
    df_grouped_area,
    lat='latitude_truncated',
    lon='longitude_truncated',
    z='counts',
    mapbox_style="open-street-map",
)
fig
# Count the images per day and label each day as "<year>_<month>_<day>".
df_grouped_date = (
    geo_date_data
    .groupby(['year', 'month', 'day'])
    .size()
    .reset_index(name='counts')
)
df_grouped_date['date'] = df_grouped_date.apply(
    lambda record: f"{record.year}_{record.month}_{record.day}",
    axis=1,
)
# Ten days with the most images
df_grouped_date.sort_values(by='counts', ascending=False).head(10)
| year | month | day | counts | date | |
|---|---|---|---|---|---|
| 24 | 2021 | 9 | 3 | 141 | 2021_9_3 |
| 17 | 2021 | 8 | 27 | 121 | 2021_8_27 |
| 25 | 2021 | 9 | 4 | 87 | 2021_9_4 |
| 5 | 2021 | 7 | 31 | 85 | 2021_7_31 |
| 80 | 2022 | 8 | 23 | 61 | 2022_8_23 |
| 21 | 2021 | 8 | 31 | 45 | 2021_8_31 |
| 22 | 2021 | 9 | 1 | 44 | 2021_9_1 |
| 78 | 2022 | 8 | 21 | 43 | 2022_8_21 |
| 14 | 2021 | 8 | 23 | 39 | 2021_8_23 |
| 20 | 2021 | 8 | 30 | 34 | 2021_8_30 |
# Bar chart with the number of images per day, restricted to 2021.
df_grouped_date_2021 = df_grouped_date[df_grouped_date['year'] == 2021]
fig, ax = plt.subplots(figsize=(20, 5))
ax.bar(x=df_grouped_date_2021['date'], height=df_grouped_date_2021['counts'])
fig.suptitle('Number of images per day (2021)')
# Tilt the date labels so they do not overlap
plt.xticks(rotation=30, ha='right')
plt.show()
Se observa cómo la frecuencia de fotografías aumenta de forma muy significativa en periodos de vacaciones y en fechas que coinciden con eventos como cenas, comidas y/o cumpleaños.